import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.io import arff


def getdata(data_dir='arff'):
    """Load the six defect datasets, merge them and return a cleaned frame.

    Reads ``CM1.arff`` and ``PC1.arff`` .. ``PC5.arff`` from *data_dir*,
    replaces missing values inside each dataset with 0 and stacks the
    datasets row-wise. Any column that still contains missing values after
    the merge (i.e. a column that is not present in all six datasets) is
    dropped, five feature columns are removed, and the ``Defective`` column
    is replaced by a numeric label column ``L`` appended as the last column
    (``b'N'`` -> 0.0, ``b'Y'`` -> 1.0, anything else -> NaN).

    As a side effect the number of rows labelled 0 and the number labelled 1
    are printed to stdout, in that order.

    Args:
        data_dir: Directory containing the ``.arff`` files. Defaults to
            ``'arff'``, resolved against the current working directory.

    Returns:
        pandas.DataFrame: The merged data with a fresh 0..n-1 index.

    Raises:
        KeyError: If the merged data lacks ``Defective`` or any of the
            feature columns that are removed.
    """
    # Function-scope import keeps this block self-contained.
    import os

    dataset_names = ('CM1', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5')
    dropped_features = [
        'CONDITION_COUNT', 'CYCLOMATIC_COMPLEXITY', 'HALSTEAD_ERROR_EST',
        'LOC_TOTAL', 'NUM_OPERANDS',
    ]

    frames = []
    for name in dataset_names:
        # loadarff returns a (records, metadata) pair; only records are used.
        records, _meta = arff.loadarff(os.path.join(data_dir, name + '.arff'))
        # Fill per dataset *before* merging so that the dropna below only
        # removes columns that are absent from some dataset.
        frames.append(pd.DataFrame(records).fillna(value=0))

    result = pd.concat(frames, axis=0)
    result = result.dropna(axis=1)
    result = result.drop(dropped_features, axis=1)
    result.reset_index(inplace=True, drop=True)

    # Nominal ARFF values are loaded as bytes, hence the bytes keys.
    # Values outside the mapping become NaN and are counted in neither class.
    defective = result.pop('Defective')
    result['L'] = defective.map({b'N': 0.0, b'Y': 1.0}).astype(float)

    non_defective_count = int((result['L'] == 0).sum())
    defective_count = int((result['L'] == 1).sum())
    print(non_defective_count, defective_count)
    return result


# Runs the whole load/clean pipeline (including its printed class counts)
# every time this module is executed or imported; the returned frame is
# discarded here.
# NOTE(review): if other modules import getdata from this file, consider
# guarding this call with `if __name__ == "__main__":` -- confirm first that
# nothing relies on the import-time run.
getdata()
